# Setup: chunk options, libraries, and data --------------------------------
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(ggplot2)
library(dplyr)
library(tidymodels)
library(lubridate)
library(rpart.plot)
library(cluster)
library(forcats)
# Resolve name clashes in favor of tidymodels functions, and prefer vip::vi
tidymodels_prefer()
conflicted::conflict_prefer("vi", "vip")
# Load the pre-cleaned accident data; the .Rdata file provides the
# `accident_clean` object (built by the commented-out pipeline below).
load("accident_cleanest.Rdata")
accidents <- accident_clean  # `<-` for assignment (tidyverse style)
# how we cleaned the data
# accident_clean <- accidents %>%
# filter(year(Start_Time) >= 2018, year(Start_Time) <= 2019) %>%
# drop_na(.) %>%
# select(-Start_Lat, -Start_Lng, -End_Lat, -End_Lng, -City, -State, -Description, -Number, - Street, -Side, -County, -Zipcode, -Country, -Timezone, -Airport_Code, -Weather_Timestamp, -`Wind_Chill(F)`, -`Humidity(%)`, -`Pressure(in)`, -Wind_Direction, -`Precipitation(in)`, -Amenity, -Bump, -Give_Way, -No_Exit, -Railway, -Roundabout, - Station, -Stop, -Traffic_Calming, -Turning_Loop, -Civil_Twilight, -Nautical_Twilight, -Astronomical_Twilight) %>%
# sample_frac(size = 1/5) %>%
# mutate(Crossing = if_else(Crossing, 1, 0)) %>%
# mutate(Junction = if_else(Junction, 1, 0)) %>%
# mutate(Traffic_Signal = if_else(Traffic_Signal, 1, 0)) %>%
# mutate(logDist = log(`Distance(mi)`+.1)) %>%
# mutate(Duration = round(End_Time - Start_Time)) %>%
# rename(Temp = `Temperature(F)`) %>%
# rename(Wind = `Wind_Speed(mph)`) %>%
# rename(Vis = `Visibility(mi)`) %>%
# mutate(dayofweek = lubridate::wday(Start_Time), month = month(Start_Time)) %>%
# select(-`Distance(mi)`, - End_Time) %>%
# mutate(Severity = as.factor(Severity))
# Reproducibility for the random CV fold assignment below
set.seed(253)

# Drop extreme-duration outliers and treat day of week as categorical
accidents <- accidents %>%
  filter(Duration < 10000) %>%
  mutate(dayofweek = as.factor(dayofweek)) # Brianna added

# 10-fold cross-validation splits (fold assignment is the random part)
accident_cv <- vfold_cv(accidents, v = 10)
# Project color palette (hex codes) used throughout the plots below
darkRose <- "#741b47"
rose     <- "#ead1dc"
darkBlu  <- "#073763"
midPurp  <- "#3e2956"
roses    <- c("#73666c", "#734d60", "#733453")
blues    <- c("#435463", "#2f4a63", "#1b4063")
#training(accident_cv$splits[[1]]) # pulls training data for the 1st split (1st fold is testing set)
#testing(accident_cv$splits[[1]]) # pulls testing data for the 1st split (1st fold is testing set)
Use the lm engine and LASSO (glmnet engine) to build a series of initial regression models for the quantitative outcome as a function of the predictors of interest. (As part of data cleaning, exclude any variables that you don't want to consider as predictors.) Below we specify lm_spec and lm_lasso_spec (the LASSO penalty will need to be tuned).

lm_spec <-
linear_reg() %>%
set_engine(engine = 'lm') %>%
set_mode('regression')
# LASSO specification: linear regression fit with glmnet.
# mixture = 1 selects the pure lasso penalty; the penalty amount is left
# as a tuning parameter to be chosen by cross-validation. Passing the
# arguments directly to linear_reg() is equivalent to using set_args().
lm_lasso_spec <-
  linear_reg(mixture = 1, penalty = tune()) %>%
  set_mode("regression") %>%
  set_engine(engine = "glmnet")
Create a recipe with the formula, data, and pre-processing steps: remove near-zero-variance predictors (step_nzv()), remove variables that are highly correlated with other variables (step_corr()), normalize all quantitative predictors (step_normalize(all_numeric_predictors())), and add indicator variables for any categorical variables (step_dummy(all_nominal_predictors())).

car_rec <- recipe(logDist ~ Vis + Temp + Wind + Duration + hour, data = accidents) %>%
step_nzv(all_predictors()) %>%
step_other(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Recipe predicting logDist from ALL remaining columns (Brianna added).
car_all_rec <- recipe(logDist ~ ., data = accidents) %>% #Brianna added
# drop predictors with near-zero variance
step_nzv(all_predictors()) %>%
# collapse infrequent factor levels into an "other" category
step_other(all_nominal_predictors()) %>%
# assign a placeholder level for factor levels unseen at prep time
# NOTE(review): step_novel() is usually placed before step_other() --
# confirm this ordering is intentional
step_novel(all_nominal_predictors()) %>%
# center and scale all quantitative predictors
step_normalize(all_numeric_predictors()) %>%
# indicator (dummy) variables for the remaining categorical predictors
step_dummy(all_nominal_predictors())
# Bundle the OLS model spec with the 5-predictor recipe
model_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(car_rec)

# Fit the OLS workflow on the full training data
fit_model <- fit(model_wf, data = accidents)

# Preview the fully pre-processed (prepped and juiced) all-predictor data
car_all_rec %>%
  prep(accidents) %>%
  juice()
## # A tibble: 75,061 × 28
## Temp Wind Junction Traffic_Signal Duration month hour logDist
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1.30 -0.834 -0.428 -0.282 0.834 1.29 -1.47 -0.591
## 2 0.0421 -0.394 -0.428 -0.282 -0.747 1.29 -0.157 -1.36
## 3 -0.166 -0.623 -0.428 -0.282 1.73 -0.684 -0.909 -0.498
## 4 -0.529 -1.50 -0.428 -0.282 -0.492 1.00 -0.909 -2.30
## 5 1.08 -1.50 -0.428 -0.282 -0.747 -0.121 0.783 3.57
## 6 0.406 -0.355 -0.428 -0.282 0.834 -0.965 1.91 -0.0377
## 7 1.39 -0.164 -0.428 -0.282 0.0175 0.161 0.407 -2.30
## 8 1.86 -0.547 -0.428 3.55 -0.252 0.723 0.219 -2.30
## 9 0.614 -0.547 -0.428 3.55 -0.739 -0.121 -1.47 -1.38
## 10 -0.529 -0.394 2.34 -0.282 -0.739 1.29 -0.909 -0.528
## # … with 75,051 more rows, and 20 more variables: Severity_X3 <dbl>,
## # Severity_X4 <dbl>, Severity_new <dbl>, Weather_Condition_Cloudy <dbl>,
## # Weather_Condition_Fair <dbl>, Weather_Condition_Light.Rain <dbl>,
## # Weather_Condition_Mostly.Cloudy <dbl>, Weather_Condition_Overcast <dbl>,
## # Weather_Condition_Partly.Cloudy <dbl>, Weather_Condition_other <dbl>,
## # Weather_Condition_new <dbl>, Sunrise_Sunset_Night <dbl>,
## # Sunrise_Sunset_new <dbl>, dayofweek_X2 <dbl>, dayofweek_X3 <dbl>, …
# Workflow pairing the all-predictor recipe with the tunable LASSO spec
lasso_wf_car <- workflow() %>%
  add_model(lm_lasso_spec) %>%
  add_recipe(car_all_rec) # Brianna added

# Cross-validated performance of the plain OLS workflow
mod1_cv <- fit_resamples(
  model_wf,
  resamples = accident_cv,
  metrics = metric_set(rmse, rsq, mae)
)
# 30 candidate penalty values, evenly spaced on the log10 scale
# over 10^-3 .. 10^1
penalty_grid <- grid_regular(
penalty(range = c(-3, 1)), #log10 transformed
levels = 30)
# Tune the LASSO penalty over the CV folds, recording rmse/mae/rsq
tune_output <- tune_grid(
lasso_wf_car,
resamples = accident_cv,
metrics = metric_set(rmse, mae, rsq),
grid = penalty_grid)
# 1-SE rule: largest (most parsimonious) penalty whose CV MAE is within
# one standard error of the lowest CV MAE
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'mae', desc(penalty))
best_se_penalty # choose penalty value based on the largest penalty within 1 se of the lowest CV MAE
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.00356 mae standard 0.733 10 0.00185 Preprocessor1_Mod… 0.732 0.733
# Alternative: the single penalty value with the lowest CV MAE
best_penalty <- select_best(tune_output, metric = 'mae')
best_penalty # choose penalty value based on lowest mae
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.001 Preprocessor1_Model01
# CV metric curves across the penalty grid, styled with the project palette.
# element_rect()'s first positional argument is `fill`, so backgrounds are
# filled with the `rose` color.
autoplot(tune_output) +
theme_classic() +
theme(panel.background = element_rect(rose),
plot.background = element_rect(rose),
legend.position = "none",
text = element_text(color = darkBlu, face = "bold"),
axis.text = element_text(color = darkBlu, face = "bold"),
axis.line = element_line(color = darkBlu),
axis.ticks = element_line(color = darkBlu),
strip.background = element_blank(),
strip.text = element_text(color = darkBlu))
# Plug each chosen penalty back into the LASSO workflow
final_wf <- finalize_workflow(lasso_wf_car, best_penalty) # incorporates penalty value to workflow
final_wf_se <- finalize_workflow(lasso_wf_car, best_se_penalty)
# Refit each finalized LASSO on the full data set
final_fit <- fit(final_wf, data = accidents)
final_fit_se <- fit(final_wf_se, data = accidents)
# Coefficients at the lowest-MAE penalty
tidy(final_fit)
## # A tibble: 28 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -1.12 0.001
## 2 Temp 0.0294 0.001
## 3 Wind 0.0352 0.001
## 4 Junction 0.0521 0.001
## 5 Traffic_Signal -0.192 0.001
## 6 Duration 0.00175 0.001
## 7 month -0.222 0.001
## 8 hour -0.0390 0.001
## 9 Severity_X3 0.467 0.001
## 10 Severity_X4 0.922 0.001
## # … with 18 more rows
# Coefficients at the 1-SE penalty, sorted by coefficient magnitude
tidy(final_fit_se) %>% arrange(desc(abs(estimate)))
## # A tibble: 28 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -1.12 0.00356
## 2 Severity_X4 0.913 0.00356
## 3 Severity_X3 0.460 0.00356
## 4 Weather_Condition_Fair -0.459 0.00356
## 5 Weather_Condition_Cloudy -0.292 0.00356
## 6 Weather_Condition_Partly.Cloudy -0.231 0.00356
## 7 month -0.224 0.00356
## 8 Traffic_Signal -0.189 0.00356
## 9 Weather_Condition_Mostly.Cloudy -0.143 0.00356
## 10 Weather_Condition_other -0.112 0.00356
## # … with 18 more rows
#LASSO Var Importance
# Variable importance from the glmnet coefficient path: a predictor counts
# as more important the longer it remains in the model as the penalty grows.
glmnet_output <- final_fit_se %>% extract_fit_engine()
# Boolean matrix (predictors x lambdas): TRUE where the coefficient is zero
bool_predictor_exclude <- glmnet_output$beta == 0
# For each predictor, count how many lambda values keep it in the model.
# vapply() (rather than sapply()) guarantees a numeric(1) result per row,
# so the output type is stable even for degenerate inputs.
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
  this_coeff_path <- bool_predictor_exclude[row, ]
  if (sum(this_coeff_path) == ncol(bool_predictor_exclude)) {
    0  # predictor never enters the model at any penalty
  } else {
    # which.min() finds the first FALSE (first lambda where it is included)
    ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
  }
}, numeric(1))
# Collect name/importance pairs and show a quick importance bar chart
var_imp_data <- tibble(
  var_name = rownames(bool_predictor_exclude),
  var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp)) %>% ggplot(aes(x = var_imp,y = var_name)) + geom_bar(stat='identity')
# Polished importance plot: bars ordered by importance with a palette
# gradient from darkBlu (low) to darkRose (high)
var_imp_data %>% ggplot(aes(x = var_imp,y = forcats::fct_reorder(var_name,var_imp), fill = var_imp)) +
geom_bar(stat='identity') +
theme_classic() +
labs(y = '',
x = 'Importance',
title = 'Lasso Model Variable Importance') +
scale_fill_gradient(low = darkBlu, high = darkRose) +
theme(panel.background = element_rect(rose),
plot.background = element_rect(rose),
legend.position = "none",
text = element_text(color = darkBlu, face = "bold"),
axis.text = element_text(color = darkBlu, face = "bold"),
axis.line = element_line(color = darkBlu),
axis.ticks = element_line(color = darkBlu))
The standard error (std_err) is readily available when you use collect_metrics(summarize = TRUE).

mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 0.883 10 0.00143 Preprocessor1_Model1
## 2 rmse standard 1.05 10 0.00199 Preprocessor1_Model1
## 3 rsq standard 0.0373 10 0.000975 Preprocessor1_Model1
# CV metrics at the 1-SE penalty. near() avoids exact floating-point
# equality when matching the penalty value (== can silently miss).
tune_output %>% collect_metrics() %>%
filter(near(penalty, best_se_penalty %>% pull(penalty)))
## # A tibble: 3 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00356 mae standard 0.733 10 0.00185 Preprocessor1_Model05
## 2 0.00356 rmse standard 0.916 10 0.00236 Preprocessor1_Model05
## 3 0.00356 rsq standard 0.265 10 0.00253 Preprocessor1_Model05
# Training-set predictions and residuals for the 1-SE LASSO fit
mod1_output <- final_fit_se %>%
predict(new_data = accidents) %>% #this function maintains the row order of the new_data
bind_cols(accidents) %>%
mutate(resid = logDist - .pred)
# Residual diagnostic plots -------------------------------------------------
# All five plots below share identical geoms and theming, so the shared
# styling is factored into one helper. `{{ xvar }}` (tidy eval) lets the
# caller name the x-axis variable unquoted; output plots are identical to
# building each chain by hand.
plot_resid <- function(df, xvar) {
  df %>%
    ggplot(aes(x = {{ xvar }}, y = resid)) +
    geom_hline(yintercept = 0, color = darkBlu) +
    geom_point(color = darkRose, alpha = 0.2) +
    geom_smooth(color = rose) +
    theme_classic() +
    theme(panel.background = element_rect(rose),
          plot.background = element_rect(rose),
          legend.position = "none",
          text = element_text(color = darkBlu, face = "bold"),
          axis.text = element_text(color = darkBlu, face = "bold"),
          axis.line = element_line(color = darkBlu),
          axis.ticks = element_line(color = darkBlu))
}

# Residuals vs fitted values, then vs each key quantitative predictor
plot_resid(mod1_output, .pred)
plot_resid(mod1_output, Temp)
plot_resid(mod1_output, Wind)
plot_resid(mod1_output, Duration)
plot_resid(mod1_output, hour)
Summarize investigations - Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?
Societal impact Are there any harms that may come from your analyses and/or how the data were collected?
One element of the way the data were collected that may be harmful is that this information is gathered from MapQuest and Bing APIs that track live traffic. Much of this traffic data is collected from users of the apps, many of whom do not realize they have consented to having their location information collected. As a result, information on traffic duration comes from individuals who may not have wanted to contribute to the research. Similarly, another possible harm of the dataset is the inclusion of duplicate records of the same accident. The data collectors took measures to filter out duplicates, but some may remain, which could affect the integrity of our results.
What cautions do you want to keep in mind when communicating your work?
We want to make note that since the dataset is so extensive ( >1M cases), we needed to filter the dataset down to almost half of the original amount in order to be able to run our models. As a result, we are only looking at specific years, which may be reflective of whatever the conditions of overall traffic patterns are. For example, by excluding 2020, we are not measuring the effect COVID-19 had on traffic. Likewise, not incorporating 2016 and 2017 will not take into account the lower gas prices that may have led to higher traffic volumes. As a result, we need to take these and other possible shortcomings of the dataset into consideration.
Accounting for nonlinearity a. Update your OLS model(s) and LASSO model to use natural splines for the quantitative predictors. -You’ll need to update the recipe to include step_ns() for each quantitative predictor. -It’s recommended to use few knots (e.g., 2 knots = 3 degrees of freedom).
# GAM specification: mgcv engine; s() terms below get penalized-spline
# smooths, everything else enters linearly
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
# Fit the GAM with smooths on the quantitative predictors
gam_mod <- fit(gam_spec,
logDist ~ s(Temp) + s(Wind) + Weather_Condition + Crossing + Junction + Traffic_Signal +
Sunrise_Sunset + s(Duration) + dayofweek + s(hour) + s(month),
data = accidents) #Brianna adjusted
# 2x2 panel of mgcv diagnostics, styled with the project palette
par(mfrow=c(2,2), bg = rose, col.lab = darkBlu, col.axis = darkBlu, col.main = darkBlu, fg = darkBlu)
gam_mod %>% pluck('fit') %>% mgcv::gam.check(col = darkRose)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 13 iterations.
## The RMS GCV score gradient at convergence was 1.384619e-07 .
## The Hessian was positive definite.
## Model rank = 130 / 130
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Temp) 9.00 7.81 0.99 0.20
## s(Wind) 9.00 7.16 0.99 0.28
## s(Duration) 9.00 8.96 0.89 <2e-16 ***
## s(hour) 9.00 8.06 0.98 0.05 *
## s(month) 9.00 8.91 0.99 0.36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full mgcv summary: parametric coefficients + smooth-term significance
gam_mod %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## logDist ~ s(Temp) + s(Wind) + Weather_Condition + Crossing +
## Junction + Traffic_Signal + Sunrise_Sunset + s(Duration) +
## dayofweek + s(hour) + s(month)
##
## Parametric coefficients:
## Estimate Std. Error t value
## (Intercept) -1.699301 0.299158 -5.680
## Weather_ConditionBlowing Dust / Windy -0.113195 0.500291 -0.226
## Weather_ConditionBlowing Snow 1.470118 0.472961 3.108
## Weather_ConditionBlowing Snow / Windy 0.723665 0.945748 0.765
## Weather_ConditionClear 0.788309 0.299026 2.636
## Weather_ConditionCloudy 0.457016 0.299084 1.528
## Weather_ConditionCloudy / Windy 0.592156 0.311379 1.902
## Weather_ConditionDrizzle 0.588546 0.326187 1.804
## Weather_ConditionDrizzle and Fog -0.168449 0.944442 -0.178
## Weather_ConditionFair 0.364531 0.298914 1.220
## Weather_ConditionFair / Windy 0.199085 0.304970 0.653
## Weather_ConditionFog 0.429946 0.300835 1.429
## Weather_ConditionFog / Windy -0.110175 0.500483 -0.220
## Weather_ConditionFreezing Rain -0.333067 0.597735 -0.557
## Weather_ConditionFunnel Cloud -0.244806 0.944362 -0.259
## Weather_ConditionHaze 0.477030 0.300150 1.589
## Weather_ConditionHaze / Windy -0.091510 0.474989 -0.193
## Weather_ConditionHeavy Drizzle 0.919122 0.451627 2.035
## Weather_ConditionHeavy Rain 0.667117 0.302637 2.204
## Weather_ConditionHeavy Rain / Windy 0.310725 0.598039 0.520
## Weather_ConditionHeavy Snow 1.021533 0.319849 3.194
## Weather_ConditionHeavy Snow / Windy -1.005996 0.945111 -1.064
## Weather_ConditionHeavy Snow with Thunder -0.081674 0.944355 -0.086
## Weather_ConditionHeavy T-Storm 0.808705 0.343484 2.354
## Weather_ConditionHeavy T-Storm / Windy 0.754005 0.598942 1.259
## Weather_ConditionHeavy Thunderstorms and Rain 0.638855 0.339439 1.882
## Weather_ConditionIce Pellets 1.111673 0.451980 2.460
## Weather_ConditionLight Blowing Snow 2.062434 0.700434 2.945
## Weather_ConditionLight Drizzle 0.767960 0.304806 2.520
## Weather_ConditionLight Drizzle / Windy 1.099272 0.597912 1.839
## Weather_ConditionLight Freezing Drizzle 0.906861 0.352755 2.571
## Weather_ConditionLight Freezing Fog 1.217418 0.339730 3.583
## Weather_ConditionLight Freezing Rain 0.687171 0.314317 2.186
## Weather_ConditionLight Freezing Rain / Windy 1.614141 0.944756 1.709
## Weather_ConditionLight Ice Pellets 0.490788 0.422664 1.161
## Weather_ConditionLight Rain 0.626172 0.299246 2.092
## Weather_ConditionLight Rain / Windy 0.472835 0.322074 1.468
## Weather_ConditionLight Rain Shower 0.158610 0.538404 0.295
## Weather_ConditionLight Rain Showers 1.714527 0.700350 2.448
## Weather_ConditionLight Rain with Thunder 0.563364 0.320154 1.760
## Weather_ConditionLight Snow 0.760681 0.299892 2.537
## Weather_ConditionLight Snow / Windy 0.138058 0.339476 0.407
## Weather_ConditionLight Thunderstorms and Rain 0.586552 0.320172 1.832
## Weather_ConditionLow Drifting Snow 0.991784 0.944594 1.050
## Weather_ConditionMist 0.728105 0.333270 2.185
## Weather_ConditionMostly Cloudy 0.616038 0.298999 2.060
## Weather_ConditionMostly Cloudy / Windy 0.493516 0.313077 1.576
## Weather_ConditionN/A Precipitation 0.302666 0.395425 0.765
## Weather_ConditionOvercast 0.771820 0.299111 2.580
## Weather_ConditionPartly Cloudy 0.531379 0.299021 1.777
## Weather_ConditionPartly Cloudy / Windy 0.282014 0.315504 0.894
## Weather_ConditionPatches of Fog 0.322746 0.346793 0.931
## Weather_ConditionRain 0.632486 0.300383 2.106
## Weather_ConditionRain / Windy 0.513150 0.358282 1.432
## Weather_ConditionRain Showers 1.388556 0.700395 1.983
## Weather_ConditionSand / Dust Whirlwinds 0.615851 0.598082 1.030
## Weather_ConditionScattered Clouds 0.729811 0.299344 2.438
## Weather_ConditionShallow Fog 0.425411 0.402961 1.056
## Weather_ConditionShowers in the Vicinity 0.354527 0.354736 0.999
## Weather_ConditionSmoke 0.375980 0.306900 1.225
## Weather_ConditionSnow 0.734567 0.304868 2.409
## Weather_ConditionSnow / Windy 0.064024 0.701456 0.091
## Weather_ConditionSnow and Sleet -0.606934 0.944560 -0.643
## Weather_ConditionSnow and Sleet / Windy 0.503123 0.945600 0.532
## Weather_ConditionT-Storm 0.581050 0.318629 1.824
## Weather_ConditionT-Storm / Windy 0.136743 0.500441 0.273
## Weather_ConditionThunder 0.614114 0.328664 1.869
## Weather_ConditionThunder / Windy 1.414088 0.946520 1.494
## Weather_ConditionThunder and Hail / Windy 0.964570 0.944424 1.021
## Weather_ConditionThunder in the Vicinity 0.635240 0.318909 1.992
## Weather_ConditionThunderstorm 0.739377 0.320522 2.307
## Weather_ConditionThunderstorms and Rain 0.903090 0.345007 2.618
## Weather_ConditionTornado 1.041939 0.944239 1.103
## Weather_ConditionWintry Mix 0.283855 0.320145 0.887
## Weather_ConditionWintry Mix / Windy 0.691281 0.944548 0.732
## Crossing -0.243283 0.019498 -12.478
## Junction 0.107356 0.009194 11.677
## Traffic_Signal -0.597938 0.014143 -42.279
## Sunrise_SunsetNight -0.022987 0.013358 -1.721
## dayofweek2 -0.016947 0.014719 -1.151
## dayofweek3 -0.019297 0.014546 -1.327
## dayofweek4 -0.013905 0.014574 -0.954
## dayofweek5 -0.021491 0.014606 -1.471
## dayofweek6 -0.001399 0.014580 -0.096
## dayofweek7 -0.013149 0.016661 -0.789
## Pr(>|t|)
## (Intercept) 1.35e-08 ***
## Weather_ConditionBlowing Dust / Windy 0.821001
## Weather_ConditionBlowing Snow 0.001882 **
## Weather_ConditionBlowing Snow / Windy 0.444168
## Weather_ConditionClear 0.008384 **
## Weather_ConditionCloudy 0.126503
## Weather_ConditionCloudy / Windy 0.057211 .
## Weather_ConditionDrizzle 0.071185 .
## Weather_ConditionDrizzle and Fog 0.858442
## Weather_ConditionFair 0.222651
## Weather_ConditionFair / Windy 0.513886
## Weather_ConditionFog 0.152957
## Weather_ConditionFog / Windy 0.825765
## Weather_ConditionFreezing Rain 0.577382
## Weather_ConditionFunnel Cloud 0.795459
## Weather_ConditionHaze 0.111996
## Weather_ConditionHaze / Windy 0.847228
## Weather_ConditionHeavy Drizzle 0.041841 *
## Weather_ConditionHeavy Rain 0.027503 *
## Weather_ConditionHeavy Rain / Windy 0.603363
## Weather_ConditionHeavy Snow 0.001405 **
## Weather_ConditionHeavy Snow / Windy 0.287141
## Weather_ConditionHeavy Snow with Thunder 0.931080
## Weather_ConditionHeavy T-Storm 0.018554 *
## Weather_ConditionHeavy T-Storm / Windy 0.208073
## Weather_ConditionHeavy Thunderstorms and Rain 0.059827 .
## Weather_ConditionIce Pellets 0.013913 *
## Weather_ConditionLight Blowing Snow 0.003236 **
## Weather_ConditionLight Drizzle 0.011754 *
## Weather_ConditionLight Drizzle / Windy 0.065990 .
## Weather_ConditionLight Freezing Drizzle 0.010149 *
## Weather_ConditionLight Freezing Fog 0.000339 ***
## Weather_ConditionLight Freezing Rain 0.028801 *
## Weather_ConditionLight Freezing Rain / Windy 0.087543 .
## Weather_ConditionLight Ice Pellets 0.245573
## Weather_ConditionLight Rain 0.036397 *
## Weather_ConditionLight Rain / Windy 0.142083
## Weather_ConditionLight Rain Shower 0.768305
## Weather_ConditionLight Rain Showers 0.014363 *
## Weather_ConditionLight Rain with Thunder 0.078469 .
## Weather_ConditionLight Snow 0.011198 *
## Weather_ConditionLight Snow / Windy 0.684244
## Weather_ConditionLight Thunderstorms and Rain 0.066957 .
## Weather_ConditionLow Drifting Snow 0.293741
## Weather_ConditionMist 0.028912 *
## Weather_ConditionMostly Cloudy 0.039370 *
## Weather_ConditionMostly Cloudy / Windy 0.114951
## Weather_ConditionN/A Precipitation 0.444025
## Weather_ConditionOvercast 0.009871 **
## Weather_ConditionPartly Cloudy 0.075562 .
## Weather_ConditionPartly Cloudy / Windy 0.371404
## Weather_ConditionPatches of Fog 0.352033
## Weather_ConditionRain 0.035242 *
## Weather_ConditionRain / Windy 0.152076
## Weather_ConditionRain Showers 0.047423 *
## Weather_ConditionSand / Dust Whirlwinds 0.303149
## Weather_ConditionScattered Clouds 0.014770 *
## Weather_ConditionShallow Fog 0.291104
## Weather_ConditionShowers in the Vicinity 0.317599
## Weather_ConditionSmoke 0.220545
## Weather_ConditionSnow 0.015979 *
## Weather_ConditionSnow / Windy 0.927276
## Weather_ConditionSnow and Sleet 0.520513
## Weather_ConditionSnow and Sleet / Windy 0.594681
## Weather_ConditionT-Storm 0.068218 .
## Weather_ConditionT-Storm / Windy 0.784666
## Weather_ConditionThunder 0.061694 .
## Weather_ConditionThunder / Windy 0.135184
## Weather_ConditionThunder and Hail / Windy 0.307101
## Weather_ConditionThunder in the Vicinity 0.046384 *
## Weather_ConditionThunderstorm 0.021069 *
## Weather_ConditionThunderstorms and Rain 0.008857 **
## Weather_ConditionTornado 0.269827
## Weather_ConditionWintry Mix 0.375274
## Weather_ConditionWintry Mix / Windy 0.464253
## Crossing < 2e-16 ***
## Junction < 2e-16 ***
## Traffic_Signal < 2e-16 ***
## Sunrise_SunsetNight 0.085277 .
## dayofweek2 0.249593
## dayofweek3 0.184637
## dayofweek4 0.340023
## dayofweek5 0.141188
## dayofweek6 0.923575
## dayofweek7 0.430003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Temp) 7.813 8.470 54.13 <2e-16 ***
## s(Wind) 7.164 7.857 28.67 <2e-16 ***
## s(Duration) 8.959 8.999 750.89 <2e-16 ***
## s(hour) 8.058 8.757 24.94 <2e-16 ***
## s(month) 8.909 8.998 387.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.297 Deviance explained = 29.9%
## GCV = 0.80335 Scale est. = 0.802 n = 75061
# Partial-effect plot for each smooth term in the GAM
gam_mod %>% pluck('fit') %>% plot(col = darkRose,
col.sub = darkBlu)
# Natural-spline recipe: all predictors, with 5-df natural splines on each
# quantitative predictor. The original two step_ns() calls used the same
# deg_free, so they are consolidated into a single step (same basis columns,
# same names, same order).
# NOTE(review): 5 df is more flexible than the recommended 2 knots / 3 df --
# confirm this is intentional.
spline_rec <- recipe(logDist ~ ., data = accidents) %>%
  step_nzv(all_predictors()) %>%
  step_other(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_ns(Wind, Duration, month, hour, Temp, deg_free = 5)
# Check the pre-processed data
spline_rec %>% prep(accidents) %>% juice()
## # A tibble: 75,061 × 44
## Junction Traffic_Signal logDist Severity_X3 Severity_X4 Weather_Condition_Cl…
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 -0.591 0 0 0
## 2 0 0 -1.36 0 1 0
## 3 0 0 -0.498 1 0 0
## 4 0 0 -2.30 0 0 0
## 5 0 0 3.57 0 1 0
## 6 0 0 -0.0377 0 0 0
## 7 0 0 -2.30 0 0 0
## 8 0 1 -2.30 0 0 0
## 9 0 1 -1.38 0 1 0
## 10 1 0 -0.528 0 0 0
## # … with 75,051 more rows, and 38 more variables: Weather_Condition_Fair <dbl>,
## # Weather_Condition_Light.Rain <dbl>, Weather_Condition_Mostly.Cloudy <dbl>,
## # Weather_Condition_Overcast <dbl>, Weather_Condition_Partly.Cloudy <dbl>,
## # Weather_Condition_other <dbl>, Sunrise_Sunset_Night <dbl>,
## # dayofweek_X2 <dbl>, dayofweek_X3 <dbl>, dayofweek_X4 <dbl>,
## # dayofweek_X5 <dbl>, dayofweek_X6 <dbl>, dayofweek_X7 <dbl>,
## # Wind_ns_1 <dbl>, Wind_ns_2 <dbl>, Wind_ns_3 <dbl>, Wind_ns_4 <dbl>, …
# OLS spec (re-declared here; identical to the lm_spec defined earlier)
lm_spec <-
linear_reg() %>%
set_engine(engine = 'lm') %>%
set_mode('regression')
# Workflow (Recipe + Model)
spline_wf <- workflow() %>%
add_recipe(spline_rec) %>%
add_model(lm_spec)
# CV to Evaluate
cv_output <- fit_resamples(
spline_wf, # workflow
resamples =accident_cv, # cv folds
metrics = metric_set(mae,rmse,rsq)
)
# CV metrics for the spline model
cv_output %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 0.661 10 0.00191 Preprocessor1_Model1
## 2 rmse standard 0.863 10 0.00228 Preprocessor1_Model1
## 3 rsq standard 0.348 10 0.00211 Preprocessor1_Model1
# Fit model
# Fit the natural-spline OLS workflow on the full data
ns_mod <- spline_wf %>%
fit(data = accidents)
# Coefficients sorted by |t-statistic|
ns_mod %>%
tidy() %>%
arrange(desc(abs(statistic)))
## # A tibble: 44 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Severity_X4 0.678 0.0108 62.9 0
## 2 Traffic_Signal -0.705 0.0122 -57.9 0
## 3 month_ns_5 -0.439 0.0131 -33.5 6.03e-245
## 4 Weather_Condition_Fair -0.340 0.0118 -28.9 3.61e-183
## 5 Severity_X3 0.236 0.00971 24.3 4.26e-130
## 6 month_ns_1 0.506 0.0219 23.1 1.02e-117
## 7 Weather_Condition_Cloudy -0.266 0.0145 -18.4 4.54e- 75
## 8 Weather_Condition_Partly.Cloudy -0.204 0.0132 -15.5 6.54e- 54
## 9 Junction 0.115 0.00884 13.1 5.76e- 39
## 10 Weather_Condition_Mostly.Cloudy -0.141 0.0122 -11.5 7.86e- 31
## # … with 34 more rows
# Visualizing the Predictions (focusing on Duration, Weather_Condition)
# Grid over Duration for three weather conditions, holding the other
# predictors at representative values; plot predicted logDist curves.
data_grid <- expand.grid(Duration = seq(10,360, by=1), Temp = 30, Sunrise_Sunset = 'Day', Weather_Condition = c("Mostly Cloudy",'Clear','Snow'), Wind = 0, Junction = 0 , Traffic_Signal = 0, month = 1, hour = 5 , Vis = 10, Crossing = 0, dayofweek = "1" ,Severity = "2")
predicted_lines <- data_grid %>%
bind_cols(
predict(ns_mod, new_data = data_grid))
# Manual colors are matched to Weather_Condition factor-level order
ggplot(predicted_lines, aes(x = Duration, y = .pred, color = Weather_Condition)) + geom_point() + geom_line()+ theme_classic() + ggtitle('Spline Model Predictions') +
scale_color_manual(values = c(darkRose, darkBlu, midPurp)) +
theme(panel.background = element_rect(rose),
plot.background = element_rect(rose),
legend.background = element_blank(),
text = element_text(color = darkBlu, face = "bold"),
axis.text = element_text(color = darkBlu, face = "bold"),
axis.line = element_line(color = darkBlu),
axis.ticks = element_line(color = darkBlu))
# Visualizing the Predictions (focusing on Hour & Sunrise_Sunset)
# Grid over all 24 hours for day vs night, other predictors held fixed
data_grid <- expand.grid(Duration = 30, Temp = 30, Sunrise_Sunset = c('Day','Night'), Weather_Condition = "Mostly Cloudy", Wind = 0, Junction = 0 , Traffic_Signal = 0, month = 1, hour = 0:23 , Vis = 10, Crossing = 0, dayofweek = "1", Severity = "2")
predicted_lines <- data_grid %>%
bind_cols(
predict(ns_mod, new_data = data_grid))
# Manual colors are matched to Sunrise_Sunset factor-level order
ggplot(predicted_lines, aes(x = hour, y = .pred, color = Sunrise_Sunset)) + geom_point() + geom_line()+ theme_classic() + ggtitle('Spline Model Predictions') +
scale_color_manual(values = c(darkRose, darkBlu)) +
theme(panel.background = element_rect(rose),
plot.background = element_rect(rose),
legend.background = element_blank(),
text = element_text(color = darkBlu, face = "bold"),
axis.text = element_text(color = darkBlu, face = "bold"),
axis.line = element_line(color = darkBlu),
axis.ticks = element_line(color = darkBlu))
# Visualizing the Predictions (focusing on Month & Severity)
# Grid over the 12 months for each Severity level, other predictors fixed
data_grid <- expand.grid(Duration = 30, Temp = 30, Sunrise_Sunset = 'Day', Weather_Condition = "Mostly Cloudy", Wind = 0, Junction = 0 , Traffic_Signal = 0, month = 1:12, hour = 6 , Vis = 10, Crossing = 0, dayofweek = "1",Severity = c("2","3","4"))
predicted_lines <- data_grid %>%
bind_cols(
predict(ns_mod, new_data = data_grid)
)
# Manual colors are matched to Severity factor-level order
ggplot(predicted_lines, aes(x = month, y = .pred,color = Severity)) + geom_point() + geom_line()+ theme_classic() + ggtitle('Spline Model Predictions') +
scale_color_manual(values = c(darkRose, darkBlu, midPurp)) +
theme(panel.background = element_rect(rose),
plot.background = element_rect(rose),
legend.background = element_blank(),
text = element_text(color = darkBlu, face = "bold"),
axis.text = element_text(color = darkBlu, face = "bold"),
axis.line = element_line(color = darkBlu),
axis.ticks = element_line(color = darkBlu))
#OLS Model (5 variables)
# Baseline comparison: CV metrics for the 5-predictor OLS model
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 0.883 10 0.00143 Preprocessor1_Model1
## 2 rmse standard 1.05 10 0.00199 Preprocessor1_Model1
## 3 rsq standard 0.0373 10 0.000975 Preprocessor1_Model1
#Lasso Model
# CV metrics at the 1-SE penalty. near() avoids exact floating-point
# equality when matching the penalty value (== can silently miss).
tune_output %>% collect_metrics() %>%
filter(near(penalty, best_se_penalty %>% pull(penalty)))
## # A tibble: 3 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00356 mae standard 0.733 10 0.00185 Preprocessor1_Model05
## 2 0.00356 rmse standard 0.916 10 0.00236 Preprocessor1_Model05
## 3 0.00356 rsq standard 0.265 10 0.00253 Preprocessor1_Model05
#Spline Model
cv_output %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 0.661 10 0.00191 Preprocessor1_Model1
## 2 rmse standard 0.863 10 0.00228 Preprocessor1_Model1
## 3 rsq standard 0.348 10 0.00211 Preprocessor1_Model1
Compare insights from the variable importance analyses here with the corresponding results from Investigation 1. Now that we have accounted for nonlinearity, have the most relevant predictors changed? - Note that if some (but not all) of a predictor's spline terms are selected in the final models, the whole predictor should be treated as selected.
Fit a GAM using spline terms using the set of variables deemed to be most relevant based on your investigations so far. -How does test performance of the GAM compare to other models you explored? -Do you gain any insights from the GAM output plots for each predictor?
Summarize investigations -Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?
Societal impact -Are there any harms that may come from your analyses and/or how the data were collected? -What cautions do you want to keep in mind when communicating your work?
Specify the research question for a classification task.
What is the best predictor of the length of traffic caused by an accident?
Try to implement at least 2 different classification methods to answer your research question.
# --- Classification tree: tune cost_complexity via 10-fold CV ---
set.seed(123) # don't change this
accident_folds_tree <- vfold_cv(accidents, v = 10)
# Tree specification: main arguments are given directly to decision_tree()
# (equivalent to supplying them via set_args())
ct_spec_tune <- decision_tree(cost_complexity = tune(),
                              tree_depth = 3,
                              min_n = 2) %>%
  set_engine(engine = 'rpart') %>%
  set_mode('classification')
# Recipe: drop near-zero-variance predictors and pool rare factor levels
tree_rec <- recipe(Severity ~ ., data = accidents) %>%
  step_nzv(all_predictors()) %>%
  step_other(all_nominal_predictors())
# Bundle recipe + model into one workflow
tree_wf_tune <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(ct_spec_tune)
# Candidate penalties: 10 values of cost_complexity between 1e-5 and 1e1
cp_grid <- grid_regular(cost_complexity(range = c(-5, 1)), levels = 10)
tree_tune_res <- tune_grid(
  tree_wf_tune,
  resamples = accident_folds_tree,
  grid = cp_grid,
  metrics = metric_set(accuracy) # change this for regression trees
)
autoplot(tree_tune_res) + theme_classic()
# Most parsimonious tree within one SE of the best CV accuracy
best_complexity <- select_by_one_std_err(tree_tune_res, metric = 'accuracy', desc(cost_complexity))
data_wf_final <- finalize_workflow(tree_wf_tune, best_complexity)
# Refit the selected tree on the full data set
tree_final_fit <- fit(data_wf_final, data = accidents)
# CV Metrics
# Cross-validated accuracy at the selected cost_complexity value.
# (%>% binds tighter than ==, so the RHS is the pulled scalar, as intended.)
tree_tune_res %>%
collect_metrics() %>%
filter(cost_complexity == best_complexity %>% pull(cost_complexity))
## # A tibble: 1 × 7
## cost_complexity .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.001 accuracy multiclass 0.752 10 0.00178 Preprocessor1_Model04
# Tree visual
par(bg = rose) # base-graphics background to match the report's rose theme
tree_final_fit %>% extract_fit_engine() %>% rpart.plot(box.palette = list(roses, blues))
# --- Random forest (ranger engine): 500 trees, OOB evaluation later ---
# Main arguments go straight into rand_forest(); engine-level options
# (hard class predictions, impurity importance) go through set_engine().
# mtry = NULL keeps the engine default: floor(sqrt(number of total predictors)).
rf_spec <- rand_forest(mtry = NULL,
                       trees = 500, # Number of trees
                       min_n = 50) %>%
  set_engine(engine = 'ranger',
             probability = FALSE, # FALSE: get hard predictions (not needed for regression)
             importance = 'impurity') %>% # we'll come back to this at the end
  set_mode('classification') # change this for regression
# Recipe is tree_rec
# Workflows
data_wf <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(rf_spec)
# Fit the forest on the full data; ranger produces out-of-bag (OOB)
# predictions as a by-product, so no separate test split is needed here.
data_rf_fit <- fit(data_wf, data = accidents)
# Pair each observation's OOB predicted class with its true severity.
# NOTE(review): assumes the 'predictions' slot holds hard class labels,
# which follows from probability = FALSE in rf_spec — confirm if changed.
data_rf_OOB_output <- tibble(
.pred_class = data_rf_fit %>% extract_fit_engine() %>% pluck('predictions'), #OOB predictions
class = accidents %>% pull(Severity)
)
## OOB Metrics
data_rf_OOB_output %>%
accuracy(truth = class, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.764
# Confusion matrix: rows are predicted severity, columns are true severity
data_rf_OOB_output %>%
conf_mat(truth = class, estimate = .pred_class)
## Truth
## Prediction 2 3 4
## 2 54555 8782 5959
## 3 749 1211 731
## 4 678 806 1590
# Variable Importance - Impurity
# Pull the underlying ranger object so vip() can read its importance scores
impurity_fit <- data_rf_fit %>%
  extract_fit_engine()
impurity_fit %>%
  vip::vip(num_features = 30) + theme_classic() #based on impurity
# Variable Importance - Permutation
# Refit the same workflow, swapping the importance mode to permutation
permutation_fit <- data_wf %>%
  update_model(rf_spec %>% set_args(importance = "permutation")) %>% #based on permutation
  fit(data = accidents) %>%
  extract_fit_engine()
permutation_fit %>%
  vip::vip(num_features = 30) + theme_classic() +
  scale_color_manual(values = c(darkRose, darkBlu, midPurp)) +
  theme(panel.background = element_rect(rose),
        plot.background = element_rect(rose),
        legend.background = element_blank(),
        text = element_text(color = darkBlu, face = "bold"),
        axis.text = element_text(color = darkBlu, face = "bold"),
        axis.line = element_line(color = darkBlu),
        axis.ticks = element_line(color = darkBlu),
        rect = element_rect(color = darkRose))
# Visualizing the Predictions (focusing on Duration, Month, Hour)
# Grid over duration x month x hour with the other predictors held fixed
viz_grid <- expand.grid(Duration = c(10, 15, 30, 60, 120, 240), Temp = 30, Sunrise_Sunset = 'Day', Weather_Condition = "Mostly Cloudy", Wind = 0, Junction = 0, Traffic_Signal = 0, month = 1:12, hour = 0:23, Vis = 10, Crossing = 0, logDist = 0, dayofweek = "1")
# Classification tree predictions over the grid
tree_preds <- viz_grid %>%
  bind_cols(predict(tree_final_fit, new_data = viz_grid))
ggplot(tree_preds, aes(x = hour, y = month, color = .pred_class)) +
  geom_point() +
  facet_wrap(~Duration) +
  theme_classic() +
  ggtitle('Classification Tree Predictions')
# Random forest predictions over the same grid, with report theming
rf_preds <- viz_grid %>%
  bind_cols(predict(data_rf_fit, new_data = viz_grid))
ggplot(rf_preds, aes(x = hour, y = month, color = .pred_class)) +
  geom_point() +
  facet_wrap(~Duration) +
  theme_classic() +
  ggtitle('Random Forest Predictions') +
  scale_color_manual(values = c("#8c3561", "#2d0d52", "#4c85ba")) +
  labs(color = "Severity") +
  theme(panel.background = element_rect(rose),
        plot.background = element_rect(rose),
        legend.background = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(color = darkBlu),
        text = element_text(color = darkBlu, face = "bold"),
        axis.text = element_text(color = darkBlu, face = "bold"),
        axis.line = element_line(color = darkBlu),
        axis.ticks = element_line(color = darkBlu))
# CLUSTERING
# Random subsample of 50 accidents (the original comment said "penguins" —
# a leftover from the class example this code was adapted from)
set.seed(253)
accidents_Sub <- accidents %>%
slice_sample(n = 50)
# Select the variables to be used in clustering; recode character and
# 0/1 indicator columns as factors so they are treated as categorical
accidents_Clust <- accidents_Sub %>%
select(Severity, Temp, Vis, Wind, Weather_Condition, Crossing, Junction, Traffic_Signal, Sunrise_Sunset, logDist, Duration, dayofweek, month, hour) %>%
mutate(Weather_Condition = as.factor(Weather_Condition), Sunrise_Sunset = as.factor(Sunrise_Sunset)) %>%
mutate(Crossing = as.factor(Crossing), Junction = as.factor(Junction), Traffic_Signal = as.factor(Traffic_Signal))
# Summary statistics for the variables
summary(accidents_Clust)
## Severity Temp Vis Wind
## 2:38 Min. :-0.90 Min. : 0.200 Min. : 0.000
## 3: 6 1st Qu.:50.75 1st Qu.:10.000 1st Qu.: 3.775
## 4: 6 Median :64.00 Median :10.000 Median : 7.000
## Mean :61.80 Mean : 9.002 Mean : 7.652
## 3rd Qu.:77.00 3rd Qu.:10.000 3rd Qu.:12.000
## Max. :97.00 Max. :10.000 Max. :20.000
##
## Weather_Condition Crossing Junction Traffic_Signal Sunrise_Sunset
## Fair :14 0:48 0:39 0:48 Day :35
## Clear : 7 1: 2 1:11 1: 2 Night:15
## Overcast : 7
## Mostly Cloudy: 6
## Partly Cloudy: 5
## Cloudy : 3
## (Other) : 8
## logDist Duration dayofweek month hour
## Min. :-2.3026 Min. : 26.0 1: 4 Min. : 1.00 Min. : 2.00
## 1st Qu.:-1.2832 1st Qu.: 29.0 2: 9 1st Qu.: 6.00 1st Qu.: 8.25
## Median :-0.8849 Median : 30.0 3: 6 Median : 9.00 Median :12.50
## Mean :-0.9001 Mean :118.6 4:10 Mean : 8.12 Mean :12.80
## 3rd Qu.:-0.4631 3rd Qu.:240.0 5: 8 3rd Qu.:11.00 3rd Qu.:17.00
## Max. : 1.3870 Max. :360.0 6:11 Max. :12.00 Max. :23.00
## 7: 2
# daisy() standardizes the mixed-type variables using Gower's distance
# (the original comment's "Gauer" is a misspelling of Gower), then
# hierarchical clustering with complete linkage is applied.
accidents_ClustDaze <- hclust(daisy(accidents_Clust), method = 'complete')
accidents_ClustDaze %>% plot()
summary(accidents_ClustDaze)
## Length Class Mode
## merge 98 -none- numeric
## height 49 -none- numeric
## order 50 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 3 -none- call
## dist.method 0 -none- NULL
# NOTE(review): this repeats the dendrogram plot from above — possibly
# intentional for the report layout, but confirm it isn't an accidental
# duplicate.
accidents_ClustDaze %>% plot()
Reflect on the information gained from these two methods and how you might justify choosing them to others.
Keep in mind that the final project will require you to complete the pieces below. Use this as a guide for your work but don’t try to accomplish everything for HW4:
Classification - Methods Indicate at least 2 different methods used to answer your classification research question. Describe what you did to evaluate the models explored. Indicate how you estimated quantitative evaluation metrics. Describe the goals / purpose of the methods used in the overall context of your research investigations.
Classification - Results Summarize your final model and justify your model choice (see below for ways to justify your choice). Compare the different classification models tried in light of evaluation metrics, variable importance, and data context. Display evaluation metrics for different models in a clean, organized way. This display should include both the estimated metric as well as its standard deviation. (This won’t be available from OOB error estimation. If using OOB, don’t worry about reporting the SD.) Broadly summarize conclusions from looking at these evaluation metrics and their measures of uncertainty.
Classification - Conclusions - Interpret evaluation metric(s) for the final model in context. Does the model show an acceptable amount of error? - If using OOB error estimation, display the test (OOB) confusion matrix, and use it to interpret the strengths and weaknesses of the final model. - Summarization should show evidence of acknowledging the data context in thinking about the sensibility of these results.